Code
library(tidyverse)
library(plotly)
library(dplyr)
library(ggplot2)
Jorge Bris Moreno
March 20, 2024
This is the head of the data:
# A tibble: 6 × 13
Product Type `Release Date` `Process Size (nm)` `TDP (W)` `Die Size (mm^2)`
<chr> <chr> <chr> <chr> <chr> <chr>
1 AMD Athl… CPU 6/5/00 180 54 120
2 AMD Athl… CPU 10/31/00 180 54 120
3 AMD Athl… CPU 8/14/00 180 60 120
4 AMD Athl… CPU 10/31/00 180 63 120
5 AMD Athl… CPU 10/31/00 180 66 120
6 AMD Athl… CPU 10/17/00 180 66 120
# ℹ 7 more variables: `Transistors (million)` <chr>, `Freq (GHz)` <dbl>,
# Foundry <chr>, Vendor <chr>, `FP16 GFLOPS` <dbl>, `FP32 GFLOPS` <dbl>,
# `FP64 GFLOPS` <dbl>
The data seems almost clean: we will change some of the data types and put FP GFLOPS as one column, and then it will be completely clean.
# Convert the raw character columns to analysis-ready types in one pass:
# Type becomes a factor, the four measurement columns become numeric, and
# Release Date is parsed as month/day/two-digit-year. The coercion warnings
# raised for values that cannot be parsed are muted on purpose.
suppressWarnings({
  df <- df %>%
    mutate(
      Type = as.factor(Type),
      across(
        all_of(c(
          "Process Size (nm)",
          "TDP (W)",
          "Die Size (mm^2)",
          "Transistors (million)"
        )),
        as.numeric
      ),
      `Release Date` = as.Date(`Release Date`, format = "%m/%d/%y")
    )
})
This is the new summary of the data:
Product Type Release Date Process Size (nm)
Length:4945 CPU:2231 Min. :1999-03-15 Min. : 0.0
Class :character GPU:2714 1st Qu.:2007-04-01 1st Qu.: 14.0
Mode :character Median :2012-09-04 Median : 32.0
Mean :2012-09-04 Mean : 51.3
3rd Qu.:2018-04-11 3rd Qu.: 80.0
Max. :2024-01-08 Max. :250.0
NA's :199 NA's :58
TDP (W) Die Size (mm^2) Transistors (million) Freq (GHz)
Min. : 1.00 Min. : 6 Min. : 8.0 Min. : 100
1st Qu.: 35.00 1st Qu.: 114 1st Qu.: 163.5 1st Qu.: 650
Median : 65.00 Median : 169 Median : 950.0 Median :1400
Mean : 92.54 Mean : 213 Mean : 4836.4 Mean :1615
3rd Qu.: 119.00 3rd Qu.: 257 3rd Qu.: 4050.0 3rd Qu.:2500
Max. :2400.00 Max. :1280 Max. :153000.0 Max. :4700
NA's :891 NA's :716 NA's :806 NA's :437
Foundry Vendor FP16 GFLOPS FP32 GFLOPS
Length:4945 Length:4945 Min. : 10 Min. : 12.8
Class :character Class :character 1st Qu.: 1300 1st Qu.: 384.0
Mode :character Mode :character Median : 6136 Median : 1248.0
Mean : 19033 Mean : 5403.0
3rd Qu.: 20175 3rd Qu.: 5069.0
Max. :653700 Max. :93240.0
NA's :4145 NA's :3260
FP64 GFLOPS
Min. : 3.60
1st Qu.: 59.25
Median : 136.35
Mean : 1096.61
3rd Qu.: 382.45
Max. :81720.00
NA's :3667
Now, the data seems clean for our purposes. We will not discard any NAs yet; we will do that as we plot, since we don't want to lose any values for certain plots.
# Plot 1: transistor count against release date. Marker colour follows the
# process size and marker size follows the die size; the y axis is log-scaled.
suppressWarnings({
  fig <- plot_ly(
    data = df,
    x = ~`Release Date`,
    y = ~`Transistors (million)`,
    color = ~`Process Size (nm)`,
    size = ~`Die Size (mm^2)`,
    type = "scatter",
    mode = "markers"
  )
  # NOTE(review): a layout-level `coloraxis` is presumably only honoured by
  # traces that reference it; confirm that this setting really switches the
  # markers to the inferno scale.
  fig <- layout(
    fig,
    title = "Transistor Counts Over Time with Process Size",
    xaxis = list(
      title = "Release Date",
      range = c("1990-03-13", "2030-01-09")
    ),
    yaxis = list(title = "Transistors (million)", type = "log"),
    coloraxis = list(colorscale = "inferno"),
    hovermode = "closest"
  )
  fig
})
# Plot 2: transistor density against release date, coloured by device type.
suppressWarnings({
  # Transistor counts are stored in millions, hence the 1e6 factor before
  # dividing by the die size.
  df <- mutate(
    df,
    TransistorDensity = (`Transistors (million)` * 1e6) / `Die Size (mm^2)`
  )
  fig <- plot_ly(
    data = df,
    x = ~`Release Date`,
    y = ~TransistorDensity,
    color = ~Type,
    colors = c("CPU" = "burlywood1", "GPU" = "darkslategray2"),
    marker = list(size = 10, opacity = 0.7),
    type = "scatter",
    mode = "markers"
  )
  fig <- layout(
    fig,
    title = "Transistor Density Over Time",
    xaxis = list(title = "Release Date"),
    yaxis = list(title = "Transistor Density (per mm^2)", type = "log"),
    hovermode = "closest",
    legend = list(title = list(text = "Type"))
  )
  fig
})
Follows a similar procedure as before. Outputs of the cleaning process will not be printed but the code is available.
df['Type'] = df['Type'].astype('category')
df['Process Size (nm)'] = pd.to_numeric(df['Process Size (nm)'], errors='coerce')
df['TDP (W)'] = pd.to_numeric(df['TDP (W)'], errors='coerce')
df['Die Size (mm^2)'] = pd.to_numeric(df['Die Size (mm^2)'], errors='coerce')
df['Transistors (million)'] = pd.to_numeric(df['Transistors (million)'], errors='coerce')
df['Release Date'] = pd.to_datetime(df['Release Date'], format='%m/%d/%y')
# df.info()Now, our data seems clean and we will proceed to the plots.
# FP32 throughput over time: one marker trace per device type, with marker
# size driven by die size. Only rows that report FP32 GFLOPS are plotted.
df_fp32 = df.dropna(subset=['FP32 GFLOPS'])

# Min-max scale the die sizes into the marker-size range [min_size, max_size];
# rows without a die size fall back to the smallest marker.
min_size, max_size = 5, 15
die_sizes = df_fp32['Die Size (mm^2)']
die_span = die_sizes.max() - die_sizes.min()
normalized_sizes = (die_sizes - die_sizes.min()) / die_span
normalized_sizes = normalized_sizes * (max_size - min_size) + min_size
normalized_sizes = normalized_sizes.fillna(min_size)


def _fp32_trace(device_type):
    """Return the FP32-vs-release-date marker trace for one device type."""
    subset = df_fp32[df_fp32['Type'] == device_type]
    return go.Scatter(
        x=subset['Release Date'],
        y=subset['FP32 GFLOPS'],
        mode='markers',
        marker={'size': normalized_sizes[subset.index], 'opacity': 0.7},
        name=device_type,
    )


traces = [_fp32_trace(device_type) for device_type in ['CPU', 'GPU']]

fp32_layout = go.Layout(
    title='Computing Performance (FP32 GFLOPS) Over Time',
    xaxis={'title': 'Release Year'},
    yaxis={'title': 'FP32 GFLOPS', 'type': 'log'},
    template='plotly_white',
)
fig = go.Figure(data=traces, layout=fp32_layout)
fig.show()

# Efficiency column used by the next plot: FP32 throughput per watt of TDP.
df['Efficiency (GFLOPS/Watt)'] = df['FP32 GFLOPS'] / df['TDP (W)']
# Computational efficiency (FP32 GFLOPS per watt) by release year, one marker
# trace per device type, coloured by min-max normalised process size.
df['Release Year'] = df['Release Date'].dt.year
process_size = df['Process Size (nm)']
process_size_norm = (process_size - process_size.min()) / (process_size.max() - process_size.min())
colorscale = [[0, 'blue'], [1, 'red']]
traces = []
for trace_number, device_type in enumerate(['CPU', 'GPU']):
    df_filtered = df[df['Type'] == device_type]
    trace = go.Scatter(
        x=df_filtered['Release Year'],
        y=df_filtered['Efficiency (GFLOPS/Watt)'],
        mode='markers',
        marker=dict(
            size=10,
            color=process_size_norm[df_filtered.index],
            colorscale=colorscale,
            # Pin the colour range to the normalised [0, 1] interval. Without
            # cmin/cmax each trace auto-ranges to its own data, so the same
            # colour would stand for different process sizes on CPUs and GPUs.
            cmin=0,
            cmax=1,
            colorbar=dict(title='Normalized Process Size'),
            # Both traces share one scale, so a single colorbar is enough;
            # drawing it per trace stacks two bars on top of each other.
            showscale=(trace_number == 0)
        ),
        name=device_type,
        # Hover text shows the un-normalised process size in nm.
        text=df_filtered['Process Size (nm)']
    )
    traces.append(trace)
fig = go.Figure(
    data=traces,
    layout=go.Layout(
        title='Computational Efficiency Over Time (FP32 GFLOPS per Watt)',
        xaxis=dict(title='Release Year'),
        yaxis=dict(title='Efficiency (GFLOPS/Watt)', type='log'),
        # Keep the legend inside the plot area, clear of the colorbar on the right.
        legend=dict(x=0.01, y=0.99),
        template='plotly_white'
    )
)
fig.show()